F-test and statistical significance with python

Author: Leonardo Espin

Date: 4/22/2019

In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import f
import matplotlib.pyplot as plt
#import matplotlib.style
%matplotlib inline
plt.rcParams['figure.figsize'] = [8, 5]
In [2]:
#generating sinthetic data
a=1.2
b=0.5
x = np.linspace(0,10,202)

mean = 0
std = 1 
num_samples = len(x)
np.random.seed(784659321) # in case you want repeatability of results
noise = np.random.normal(mean, std, size=num_samples)

y = a*x +b +noise
In [3]:
#refactoring the data for use with scikit-learn
reg_x=x.reshape(len(x),1) #linear regression model expects a 2D matrix
reg_y=y.reshape(len(y),1) #linear regression model expects a 2D matrix
model = LinearRegression()
model.fit(reg_x,reg_y)
pred_y = model.predict(reg_x)
In [4]:
#fig, ax = plt.subplots(figsize=(8, 5))
plt.plot(x,y,'.')
plt.plot(reg_x,pred_y)
plt.ylabel('y')
plt.xlabel('x')
plt.title('Regression coefficients: {:.2f}, {:.2f}. R^2 = {:.3f}'
             .format(model.coef_[0][0],model.intercept_[0],model.score(reg_x, reg_y)))
plt.show()

Calculating the F-statistic and a p-value

In [5]:
#variance with numpy
print(np.var(y))
print(sum([(x-np.mean(y))**2 for x in y])/len(y))
#vectorized operation
sum((y-np.mean(y))**2)/len(y)
12.870345008370341
12.870345008370332
Out[5]:
12.870345008370332
In [6]:
#calculating the F-statistic and p-value
N=len(y)
res=[(y[i]-pred_y[i])**2 for i in range(N)]
Fstatistic=((np.var(y)*N - sum(res))/(2-1))/(sum(res)/(N-2))
#the survival function for F, (1-cdf)
p_fit=2
p_mean=1

critical=f.ppf(.999,p_fit-p_mean,N-p_fit)#regression and residual deg of freedom
print('\nThe F-test critical p-value for a 99.9% significance level\nwith {:} and {:} degrees of freedom is {:.2f}'.format(
    p_fit-p_mean,N-p_fit,critical))
print('\nsince {:.2f} > {:.2f}, the critical p-value, we reject the hyp. that\nthe regression model could occur by chance\n(therefore the model IS significant)'.format(
Fstatistic[0],critical))
The F-test critical p-value for a 99.9% significance level
with 1 and 200 degrees of freedom is 11.15

since 2610.10 > 11.15, the critical p-value, we reject the hyp. that
the regression model could occur by chance
(therefore the model IS significant)
  • if we add a lot more noise to the data, and we recalculate R^2 and the p-value
In [7]:
#changing the noise levels
np.random.seed(784659321) 
#or import random;random.seed(value) to avoid repeating line above
noise = np.random.normal(mean, 25*std, size=num_samples)
z = a*x +b +noise
reg_z=z.reshape(len(z),1) #linear regression model expects a 2D matrix
model.fit(reg_x,reg_z)
pred_z = model.predict(reg_x)
plt.plot(x,z,'.')
plt.plot(reg_x,pred_z)
plt.ylabel('z')
plt.xlabel('x')
plt.title('Regression coefficients: {:.2f}, {:.2f}. R^2 = {:.3f}'
             .format(model.coef_[0][0],model.intercept_[0],model.score(reg_x, reg_z)))
plt.show()
In [8]:
#calculating the F-statistic and p-value
res=[(z[i]-pred_z[i])**2 for i in range(N)]
Fstatistic=((np.var(z)*N - sum(res))/(2-1))/(sum(res)/(N-2))
#the survival function for F, (1-cdf)
p_fit=2
p_mean=1

critical=f.ppf(.95,p_fit-p_mean,N-p_fit)#regression and residual deg of freedom
print('\nThe F-test critical p-value for a 95% significance level\nwith {:} and {:} degrees of freedom is {:.2f}'.format(
    p_fit-p_mean,N-p_fit,critical))
print('\nsince {:.2f} < {:.2f}, the critical p-value, we CAN\'T reject the hyp. that\nthis regression model could occur by chance\n(the model is NOT significant)'.format(
Fstatistic[0],critical))
The F-test critical p-value for a 95% significance level
with 1 and 200 degrees of freedom is 3.89

since 2.91 < 3.89, the critical p-value, we CAN'T reject the hyp. that
this regression model could occur by chance
(the model is NOT significant)